library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(patchwork)
library(p8105.datasets)
data("weather_df")
As a starting point, let’s revisit the scatterplot of tmax against tmin made in Visualization Pt 1.
# Baseline scatterplot: daily maximum against daily minimum temperature,
# with semi-transparent points colored by weather station.
ggplot(weather_df, aes(x = tmin, y = tmax)) +
  geom_point(aes(color = name), alpha = 0.5)
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Same scatterplot, now with a title, readable axis labels, a legend title
# and a caption, all supplied through labs().
weather_df |>
  ggplot(aes(x = tmin, y = tmax)) +
  geom_point(aes(color = name), alpha = 0.5) +
  labs(
    title = "Temperature plot",
    x = "Minimum daily temperature (C)",
    y = "Maximum daily temperature (C)",
    color = "Location",
    caption = "Data from the rnoaa package"
  )
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Labelled scatterplot with hand-picked x-axis tick positions (breaks) and
# the text shown at each of them (labels).
weather_df |>
  ggplot(aes(x = tmin, y = tmax)) +
  geom_point(aes(color = name), alpha = 0.5) +
  labs(
    title = "Temperature plot",
    x = "Minimum daily temperature (C)",
    y = "Maximum daily temperature (C)",
    color = "Location",
    caption = "Data from the rnoaa package"
  ) +
  scale_x_continuous(
    breaks = c(-15, 0, 15),
    labels = c("-15º C", "0", "15")
  )
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Another way to restrict the plotted range: filter the data before plotting
# so that only days with tmax strictly between 10 and 30 are drawn.
weather_df |>
  filter(tmax > 10, tmax < 30) |>
  ggplot(aes(x = tmin, y = tmax)) +
  geom_point(aes(color = name), alpha = 0.5) +
  labs(
    title = "Temperature plot",
    x = "Minimum daily temperature (C)",
    y = "Maximum daily temperature (C)",
    color = "Location",
    caption = "Data from the rnoaa package"
  ) +
  scale_x_continuous(
    breaks = c(-15, 0, 15),
    labels = c("-15º C", "0", "15")
  )
# Limits ----
# Set the x-axis range through the scale itself and put a square-root
# transformed y axis on the right-hand side of the panel.
weather_df |>
  ggplot(aes(x = tmin, y = tmax)) +
  geom_point(aes(color = name), alpha = 0.5) +
  labs(
    title = "Temperature plot",
    x = "Minimum daily temperature (C)",
    y = "Maximum daily temperature (C)",
    color = "Location",
    caption = "Data from the rnoaa package"
  ) +
  scale_x_continuous(
    breaks = c(-15, 0, 15),
    labels = c("-15ºC", "0", "15"),
    limits = c(-20, 30)
  ) +
  # `transform` replaces the `trans` argument, which is deprecated as of
  # ggplot2 3.5.0. The square root of a negative tmax is NaN, so those
  # points are dropped with a warning.
  scale_y_continuous(
    transform = "sqrt",
    position = "right"
  )
## Warning in transformation$transform(x): NaNs produced
## Warning in scale_y_continuous(trans = "sqrt", position = "right"): sqrt
## transformation introduced infinite values.
## Warning: Removed 142 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Change the point colors manually with scale_color_hue(), restricting the
# hue range to 100-300.
weather_df |>
  ggplot(aes(x = tmin, y = tmax)) +
  geom_point(aes(color = name), alpha = 0.5) +
  labs(
    title = "Temperature plot",
    x = "Minimum daily temperature (C)",
    y = "Maximum daily temperature (C)",
    color = "Location",
    caption = "Data from the rnoaa package"
  ) +
  scale_x_continuous(
    breaks = c(-15, 0, 15),
    labels = c("-15º C", "0", "15")
  ) +
  scale_color_hue(h = c(100, 300))
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).
# scale_color_hue(h=c(100,300))
# Better way: Jeff suggests always using the viridis color palettes
# Base temperature plot, stored as an object so that later chunks can add
# themes to it without repeating this code. Station colors come from the
# discrete viridis palette.
ggp_temperature <-
  weather_df |>
  ggplot(aes(x = tmin, y = tmax)) +
  geom_point(aes(color = name), alpha = 0.5) +
  labs(
    title = "Temperature plot",
    x = "Minimum daily temperature (C)",
    y = "Maximum daily temperature (C)",
    color = "Location",
    caption = "Data from the rnoaa package"
  ) +
  scale_x_continuous(
    breaks = c(-15, 0, 15),
    labels = c("-15º C", "0", "15")
  ) +
  viridis::scale_color_viridis(
    name = "Location",
    discrete = TRUE
  )

# Write the stored plot to disk; the size defaults to the current graphics
# device because no width/height is given.
ggsave("weather_scatterplot.png", plot = ggp_temperature)
## Saving 7 x 5 in image
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Save the plot as ggp_temperature so that you don't have to keep adding to this code chunk directly.
# Use the structure `name = ...` whenever you want to save a specific graphic or your code chunk is getting large.
Update my base plot
# Jeff's classic, go-to look: the minimal theme with the legend moved below
# the plot. The legend tweak is added after theme_minimal() so that the
# complete theme does not overwrite it.
ggp_temperature +
  theme_minimal() +
  theme(legend.position = "bottom")
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Another possibility: the dark theme, again with the legend placed at the
# bottom of the figure.
ggp_temperature +
  theme_dark() +
  theme(legend.position = "bottom")
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).
# And another: the classic theme, with the legend placed at the bottom of
# the figure.
ggp_temperature +
  theme_classic() +
  theme(legend.position = "bottom")
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).
# There are also packages with themes that make a plot look like Excel 2003 or like a Wes Anderson movie.
Jeff puts this at the top of every document to automatically set how he wants his figures to look
# Standard setup chunk: load the tidyverse, then fix the figure sizing, the
# default ggplot theme and the default color scales for the whole document.
library(tidyverse)

# Figure dimensions applied to every knitr chunk.
knitr::opts_chunk$set(
  fig.width = 6,
  fig.asp = 0.6,
  out.width = "90%"
)

# Minimal theme with the legend underneath, used by every subsequent plot.
theme_set(theme_minimal() + theme(legend.position = "bottom"))

# Continuous color and fill scales default to viridis.
options(
  ggplot2.continuous.colour = "viridis",
  ggplot2.continuous.fill = "viridis"
)

# Discrete color and fill scales default to viridis too, by masking
# ggplot2's default discrete scale constructors with the viridis ones.
scale_colour_discrete <- scale_colour_viridis_d
scale_fill_discrete <- scale_fill_viridis_d
# One data frame per station, so that each geom can draw a different one.
central_park_df <- filter(weather_df, name == "CentralPark_NY")
molokai_df <- filter(weather_df, name == "Molokai_HI")

# Molokai is drawn as points (the plot's default data); Central Park is
# drawn as a line by handing geom_line() its own data frame.
molokai_df |>
  ggplot(aes(x = date, y = tmax, color = name)) +
  geom_point() +
  geom_line(data = central_park_df)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
patchwork: make three separate plots and combine them using patchwork
# Panel 1: tmax against tmin by station, with the legend suppressed.
ggp_tmax_tmin <-
  weather_df |>
  ggplot(aes(x = tmin, y = tmax, colour = name)) +
  geom_point(alpha = 0.5) +
  theme(legend.position = "none")

# Panel 2: precipitation density by station. Days without precipitation are
# filtered out because the huge spike at 0 is not of interest.
ggp_prcp_density <-
  weather_df |>
  filter(prcp > 0) |>
  ggplot(aes(x = prcp, fill = name)) +
  geom_density(alpha = 0.5) +
  theme(legend.position = "none")

# Panel 3: tmax over time with a smooth trend per station (shows the
# seasonality); this panel keeps its legend, placed at the bottom.
ggp_temp_season <-
  weather_df |>
  ggplot(aes(x = date, y = tmax, color = name)) +
  geom_point(alpha = 0.5) +
  geom_smooth(se = FALSE) +
  theme(legend.position = "bottom")

# The basic patchwork framework: `+` places two plots next to one another.
ggp_tmax_tmin + ggp_prcp_density
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).
# patchwork's `/` stacks plots on top of one another: the two side-by-side
# panels form the top row and the seasonality plot sits underneath.
(ggp_tmax_tmin + ggp_prcp_density) /
  ggp_temp_season
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 17 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).
Let’s make temperature violin plots.
# Violin plot of the tmax distribution for each station, in the default
# order of the station names.
ggplot(weather_df, aes(x = name, y = tmax, fill = name)) +
  geom_violin(alpha = 0.5)
## Warning: Removed 17 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
# factor variables -
ggplot assigns numbers to strings based on alphabetical order, so categories
are plotted alphabetically. To change this order, use the following pattern:
mutate([] = fct_relevel([]))
# Violin plot with the stations placed in a hand-chosen order: fct_relevel()
# turns `name` into a factor whose levels follow the order listed here.
weather_df |>
  mutate(
    name = fct_relevel(
      name,
      c("Molokai_HI", "CentralPark_NY", "Waterhole_WA")
    )
  ) |>
  ggplot(aes(x = name, y = tmax, fill = name)) +
  geom_violin(alpha = 0.5)
## Warning: Removed 17 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
# fct_relevel() says: take the name variable and put its values in the listed order (left to right = first to third); mutate() then stores the result as the new name variable
# Another way that Jeff does it (and more frequently)
# Violin plot with the stations ordered by their tmax values rather than by
# hand. `.na_rm = TRUE` tells fct_reorder() to ignore the missing tmax values
# explicitly, which avoids the "removing 17 missing values" warning.
weather_df |>
  mutate(name = fct_reorder(name, tmax, .na_rm = TRUE)) |>
  ggplot(aes(x = name, y = tmax, fill = name)) +
  geom_violin(alpha = 0.5)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `name = fct_reorder(name, tmax)`.
## Caused by warning:
## ! `fct_reorder()` removing 17 missing values.
## ℹ Use `.na_rm = TRUE` to silence this message.
## ℹ Use `.na_rm = FALSE` to preserve NAs.
## Removed 17 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
# fct_reorder() is especially helpful if you have a ton of categories
# here, it orders name by ascending tmax value
# Tidy the data before ggplotting: what about data tidiness?
Suppose I want to ask: does the distribution of the BDI index change across visits? To answer this we need a box plot.
BUT – right now, our data are not structured this way.
# Import the PULSE data and reshape it to one row per subject and visit, so
# that `visit` can be used as a plotting variable.
pulse_df <-
  haven::read_sas("data/public_pulse_data.sas7bdat") |>
  janitor::clean_names() |>
  pivot_longer(
    cols = bdi_score_bl:bdi_score_12m,
    names_to = "visit",
    names_prefix = "bdi_score_",
    values_to = "bdi"
  ) |>
  mutate(
    # Recode the baseline visit "bl" as "00m" ...
    visit = replace(visit, visit == "bl", "00m"),
    # ... and keep the visits in the order in which they first appear.
    visit = fct_inorder(visit)
  )
# We need to write haven:: because we didn't call library(haven) at the beginning
# fct_inorder() orders the levels by the order in which they show up in the dataset
How would I make the following graphic (look at the screen):
It looks like a patchwork of two different sets of litters, 7 on top and 8 on the bottom, with dose (con, low, mod) on the x axis and pn_day on the y axis (the postnatal day on which each event happened).
It looks at four different outcomes: ears, pivot, eyes, walk.
So we facet on two variables: the groups (days of treatment) and the outcome.
# Import the pup-level data: skip the first three lines of the file, treat
# "NA", "." and "" as missing, and recode the numeric sex codes as labels.
pups_df <-
  read_csv("data/FAS_pups.csv", na = c("NA", ".", ""), skip = 3) |>
  janitor::clean_names() |>
  mutate(sex = case_match(sex, 1 ~ "male", 2 ~ "female"))
## Rows: 313 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Litter Number
## dbl (5): Sex, PD ears, PD eyes, PD pivot, PD walk
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Import the litter-level data and split `group` after its third character
# into the dose label and the treatment day.
litters_df <-
  read_csv("data/FAS_litters.csv", na = c("NA", ".", "")) |>
  janitor::clean_names() |>
  separate(
    group,
    into = c("dose", "tx_day"),
    sep = 3
  )
## Rows: 49 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Group, Litter Number
## dbl (6): GD0 weight, GD18 weight, GD of Birth, Pups born alive, Pups dead @ ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Attach the litter-level columns (including dose and tx_day) to every pup.
fas_df <-
  left_join(pups_df, litters_df, by = "litter_number")

# Violin plots of the postnatal day for each outcome by dose, laid out as a
# grid of treatment day (rows) by outcome (columns).
fas_df |>
  select(pd_ears:tx_day) |>
  pivot_longer(
    cols = pd_ears:pd_walk,
    names_to = "outcome",
    names_prefix = "pd_",
    values_to = "pn_day"
  ) |>
  # Order the outcomes by pn_day. `.na_rm = TRUE` drops the missing pn_day
  # values explicitly, giving the same ordering without the
  # "removing 44 missing values" warning.
  mutate(outcome = fct_reorder(outcome, pn_day, .na_rm = TRUE)) |>
  drop_na() |>
  ggplot(aes(x = dose, y = pn_day)) +
  geom_violin() +
  facet_grid(tx_day ~ outcome)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `outcome = fct_reorder(outcome, pn_day)`.
## Caused by warning:
## ! `fct_reorder()` removing 44 missing values.
## ℹ Use `.na_rm = TRUE` to silence this message.
## ℹ Use `.na_rm = FALSE` to preserve NAs.
#Note that the ggplot code is only three lines long. Everything else is data manipulation!